{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.stats import gmean\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from toolkit.postprocessing import BlendingOptimizer\n",
    "\n",
    "sys.path.append('../')\n",
    "from src.utils import read_oof_predictions, calculate_rank\n",
    "\n",
    "PROJECT_DIR = 'PATH/TO/YOUR/EXPERIMENT'\n",
    "FIRST_LEVEL_PREDICTIONS_DIR = os.path.join(PROJECT_DIR,'files','out_of_fold_predictions','first_level')\n",
    "SECOND_LEVEL_PREDICTIONS_DIR = os.path.join(PROJECT_DIR,'files','out_of_fold_predictions','second_level')\n",
    "MISC_PREDICTIONS_DIR = os.path.join(PROJECT_DIR,'files','out_of_fold_predictions','misc')\n",
    "TRAIN_FILEPATH = os.path.join(PROJECT_DIR,'files','unzipped_data','application_train.csv')\n",
    "OUTPUT_DIR = 'PATH/TO/YOUR/OUTPUT'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_oof, test_oof = read_oof_predictions(FIRST_LEVEL_PREDICTIONS_DIR, TRAIN_FILEPATH, 'SK_ID_CURR','TARGET')\n",
    "train_oof_second, test_oof_second = read_oof_predictions(SECOND_LEVEL_PREDICTIONS_DIR, TRAIN_FILEPATH, 'SK_ID_CURR','TARGET')\n",
    "train_oof_misc, test_oof_misc = read_oof_predictions(MISC_PREDICTIONS_DIR, TRAIN_FILEPATH, 'SK_ID_CURR','TARGET')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_oof.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predictions\n",
    "## Correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "X_cols = [col for col in train_oof.columns if '_cv_' in col]\n",
    "X, y = train_oof[X_cols], train_oof['TARGET']\n",
    "X_corr = pd.concat([X,y], axis=1).corr()\n",
    "display(X_corr.sort_values('TARGET', ascending=False)['TARGET'])\n",
    "plt.figure(figsize=(16,12))\n",
    "sns.heatmap(X_corr, xticklabels=X_corr.columns, yticklabels=X_corr.columns, \n",
    "            annot=True,vmin=0.0,vmax=1.0)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ranks "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_to_ranks(train_oof, test_oof):\n",
    "    X_cols = [col for col in train_oof.columns if '_cv_' in col]\n",
    "    train_oof_rank, test_oof_rank = [],[]\n",
    "    for fold_id in range(5):\n",
    "        train_oof_fold = train_oof[train_oof['fold_id']==fold_id]\n",
    "        test_oof_fold = test_oof[test_oof['fold_id']==fold_id]\n",
    "        for col in X_cols:\n",
    "            train_oof_fold[col] = calculate_rank(train_oof_fold[col])\n",
    "            test_oof_fold[col] = calculate_rank(test_oof_fold[col])\n",
    "        train_oof_rank.append(train_oof_fold)\n",
    "        test_oof_rank.append(test_oof_fold)\n",
    "\n",
    "    train_oof_rank = pd.concat(train_oof_rank, axis=0)\n",
    "    test_oof_rank = pd.concat(test_oof_rank, axis=0)\n",
    "    return train_oof_rank, test_oof_rank\n",
    "\n",
    "train_oof_rank, test_oof_rank = transform_to_ranks(train_oof, test_oof)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_cols = [col for col in train_oof_rank.columns if '_cv_' in col]\n",
    "X, y = train_oof_rank[X_cols], train_oof_rank['TARGET']\n",
    "X_corr = pd.concat([X,y], axis=1).corr()\n",
    "display(X_corr.sort_values('TARGET', ascending=False)['TARGET'])\n",
    "plt.figure(figsize=(16,12))\n",
    "sns.heatmap(X_corr, xticklabels=X_corr.columns, yticklabels=X_corr.columns, \n",
    "            annot=True,vmin=0.0,vmax=1.0)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Weights optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "blender = BlendingOptimizer(metric=roc_auc_score, maximize=True)\n",
    "\n",
    "blender.fit(X=X.transpose(), y=y.tolist(),\n",
    "            step_size = 0.25, init_weights = None, warm_start = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_cols = [col for col in train_oof_rank.columns if '_cv_' in col]\n",
    "y_pred = blender.transform(train_oof_rank[X_cols].transpose())['y_pred']\n",
    "roc_auc_score(y, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_test = blender.transform(test_oof_rank[X_cols].transpose())['y_pred']\n",
    "\n",
    "def create_submission(y_pred_test, test):\n",
    "    test_predictions = test[['SK_ID_CURR','fold_id']]\n",
    "    test_predictions['TARGET'] = y_pred_test\n",
    "    submission = []\n",
    "    for fold_id, fold_df in test_predictions.groupby('fold_id'):\n",
    "        fold_df['TARGET'] = calculate_rank(fold_df['TARGET'])\n",
    "        submission.append(fold_df)\n",
    "    submission = pd.concat(submission, axis=0)\n",
    "    submission = submission.groupby('SK_ID_CURR')['TARGET'].apply(np.mean).reset_index()\n",
    "    return submission\n",
    "\n",
    "submission_first = create_submission(y_pred_test, test_oof_rank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_first.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Second level models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_oof_second_rank, test_oof_second_rank = transform_to_ranks(train_oof_second, \n",
    "                                                                 test_oof_second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_cols = [col for col in train_oof_second_rank.columns if '_cv_' in col]\n",
    "X, y = train_oof_second_rank[X_cols], train_oof_second_rank['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "blender_second = BlendingOptimizer(metric=roc_auc_score, maximize=True)\n",
    "\n",
    "blender_second.fit(X=X.transpose(), y=y.tolist(),\n",
    "            step_size = 0.05, init_weights = None, warm_start = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = blender_second.transform(train_oof_second_rank[X_cols].transpose())['y_pred']\n",
    "roc_auc_score(y, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_test_second = blender_second.transform(test_oof_second_rank[X_cols].transpose())['y_pred']\n",
    "\n",
    "submission_second = create_submission(y_pred_test_second, test_oof_second_rank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_second.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Misc Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_oof_misc_rank, test_oof_misc_rank = transform_to_ranks(train_oof_misc, test_oof_misc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_oof_misc_rank.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_misc = create_submission(test_oof_misc_rank['hc_11095_cv_7957_lb_805'], test_oof_misc_rank)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_misc.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Average First+Second+Misc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = submission_first.copy()\n",
    "\n",
    "sub_first = submission_first['TARGET']\n",
    "sub_second = submission_second['TARGET']\n",
    "sub_misc = submission_misc['TARGET']\n",
    "\n",
    "submission['TARGET'] = (1.0 * sub_first + 1.0 * sub_second + 1.0 * sub_misc)/3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission.to_csv(os.path.join(OUTPUT_DIR,'submission.csv'),index=None)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
